In [1]:
%matplotlib inline
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
from collections import Counter

from IPython.display import HTML, display
import statsmodels.api as sm


/homed/content/anaconda3/envs/python2/lib/python2.7/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [2]:
# Global seaborn styling for every figure in this notebook:
# the "poster" plotting context and the "ticks" axes style.
sns.set_context("poster")
sns.set_style("ticks")

In [3]:
# Raw topic keys (the part of `topic_name` before the first '/') -> display names.
TOPIC_MAPPING = dict(
    GunControl="Gun Control",
    Privacy="Privacy",
    Vaccine="Vaccine",
    ChildEducation="Child Education",
    SkinDamage="Skin Damage",
    SeatBelt="Seat Belt",
)

# Display order of the topics; later cells only use the first three.
topic_order = [
    "Gun Control",
    "Privacy",
    "Vaccine",
    "Child Education",
    "Skin Damage",
    "Seat Belt",
]


def _display_topic(raw_topic):
    """Translate a raw ``topic_name`` value into its display name.

    Only the text before the first '/' is looked up; an unknown key raises
    ``KeyError``.
    """
    return TOPIC_MAPPING[raw_topic.split('/')[0]]


# Load the analysis table and replace `topic_name` with the display name.
# (The original cell carried a disabled rename of a misspelled
# `is_controvertial` column; the stored frame already exposes
# `is_controversial`, so no rename is applied.)
df = pd.read_hdf("FINAL_ANALYSIS_DATA.h5", "final_data")
df = df.assign(topic_name=df.topic_name.apply(_display_topic))

# `u_state` codes that are not one of the 50 states.
NON_STATES = {"UNK", "USA", "AS", "DC", "GU", "MP", "PR", "VI"}


# Population per state (50 states plus DC), keyed by two-letter postal
# abbreviation, stored as floats.
# Written as a dict literal so the name is never bound to an intermediate list
# of text lines and the values do not depend on literal tab characters
# surviving in the notebook source.
# NOTE(review): the source and vintage of these figures are not recorded in
# the notebook -- confirm before citing them.
STATE_POPULATIONS = {
    "AL": 4863300.0,
    "AK": 741894.0,
    "AZ": 6931071.0,
    "AR": 2988248.0,
    "CA": 39250017.0,
    "CO": 5540545.0,
    "CT": 3576452.0,
    "DE": 952065.0,
    "DC": 681170.0,
    "FL": 20612439.0,
    "GA": 10310371.0,
    "HI": 1428557.0,
    "ID": 1683140.0,
    "IL": 12801539.0,
    "IN": 6633053.0,
    "IA": 3134693.0,
    "KS": 2907289.0,
    "KY": 4436974.0,
    "LA": 4681666.0,
    "ME": 1331479.0,
    "MD": 6016447.0,
    "MA": 6811779.0,
    "MI": 9928300.0,
    "MN": 5519952.0,
    "MS": 2988726.0,
    "MO": 6093000.0,
    "MT": 1042520.0,
    "NE": 1907116.0,
    "NV": 2940058.0,
    "NH": 1334795.0,
    "NJ": 8944469.0,
    "NM": 2081015.0,
    "NY": 19745289.0,
    "NC": 10146788.0,
    "ND": 757952.0,
    "OH": 11614373.0,
    "OK": 3923561.0,
    "OR": 4093465.0,
    "PA": 12784227.0,
    "RI": 1056426.0,
    "SC": 4961119.0,
    "SD": 865454.0,
    "TN": 6651194.0,
    "TX": 27862596.0,
    "UT": 3051217.0,
    "VT": 624594.0,
    "VA": 8411808.0,
    "WA": 7288000.0,
    "WV": 1831102.0,
    "WI": 5778708.0,
    "WY": 585501.0,
}


# Despite the identifier, this maps the *display name* used in the vote table
# ("Alabama", "Dist. of Col.", ...) to its two-letter abbreviation; that is the
# direction the STATE_COLORS construction looks it up in.
# Includes Guam, Puerto Rico and the Virgin Islands alongside the states and DC.
# Written as a dict literal so the name is never bound to an intermediate list
# of text lines and the key/value direction is visible at a glance.
STATE_NAMES_ABBR = {
    "Alaska": "AK",
    "Alabama": "AL",
    "Arkansas": "AR",
    "Arizona": "AZ",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Dist. of Col.": "DC",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Guam": "GU",
    "Hawaii": "HI",
    "Iowa": "IA",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Massachusetts": "MA",
    "Maryland": "MD",
    "Maine": "ME",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Missouri": "MO",
    "Mississippi": "MS",
    "Montana": "MT",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Nebraska": "NE",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "Nevada": "NV",
    "New York": "NY",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Puerto Rico": "PR",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Virginia": "VA",
    "Virgin Islands": "VI",
    "Vermont": "VT",
    "Washington": "WA",
    "Wisconsin": "WI",
    "West Virginia": "WV",
    "Wyoming": "WY",
}

## Source: http://www.presidency.ucsb.edu/showelection.php?year=2016

STATE_VOTES="""STATE	TOTAL VOTES	Hillary Votes	%	EV	Trump Votes	%	EV	Other Votes	%	EV
Alabama	2,123,372	729,547	34.4%	 	1,318,255	62.1%	9	44,467	2.1%	 
Alaska	318,608	116,454	36.6%	 	163,387	51.3%	3	18,725	5.9%	 
Arizona	2,573,165	1,161,167	45.1%	 	1,252,401	48.7%	11	106,327	4.1%	 
Arkansas	1,130,635	380,494	33.7%	 	684,872	60.6%	6	29,829	2.6%	 
California	14,181,595	8,753,788	61.7%	55	4,483,810	31.6%	 	478,500	3.4%	 
Colorado	2,780,220	1,338,870	48.2%	9	1,202,484	43.3%	 	144,121	5.2%	 
Connecticut	1,644,920	897,572	54.6%	7	673,215	40.9%	 	48,676	3.0%	 
Delaware	441,590	235,603	53.4%	3	185,127	41.9%	 	14,757	3.3%	 
Dist. of Col.	311,268	282,830	90.9%	3	12,723	4.1%	 	4,906	1.6%	 
Florida	9,420,039	4,504,975	47.8%	 	4,617,886	49.0%	29	207,043	2.2%	 
Georgia	4,092,373	1,877,963	45.9%	 	2,089,104	51.0%	16	125,306	3.1%	 
Hawaii	428,937	266,891	62.2%	3*	128,847	30.0%	 	15,954	3.7%	 
Idaho	690,255	189,765	27.5%	 	409,055	59.3%	4	28,331	4.1%	 
Illinois	5,536,424	3,090,729	55.8%	20	2,146,015	38.8%	 	209,596	3.8%	 
Indiana	2,734,958	1,033,126	37.8%	 	1,557,286	56.9%	11	133,993	4.9%	 
Iowa	1,566,031	653,669	41.7%	 	800,983	51.1%	6	59,186	3.8%	 
Kansas	1,184,402	427,005	36.1%	 	671,018	56.7%	6	55,406	4.7%	 
Kentucky	1,924,149	628,854	32.7%	 	1,202,971	62.5%	8	53,752	2.8%	 
Louisiana	2,029,032	780,154	38.4%	 	1,178,638	58.1%	8	37,978	1.9%	 
Maine	747,927	357,735	47.8%	3	335,593	44.9%	1	38,105	5.1%	 
Maryland	2,781,446	1,677,928	60.3%	10	943,169	33.9%	 	79,605	2.9%	 
Massachusetts	3,325,046	1,995,196	60.0%	11	1,090,893	32.8%	 	138,018	4.2%	 
Michigan	4,799,284	2,268,839	47.3%	 	2,279,543	47.5%	16	172,136	3.6%	 
Minnesota	2,944,813	1,367,716	46.4%	10	1,322,951	44.9%	 	112,972	3.8%	 
Mississippi	1,209,357	485,131	40.1%	 	700,714	57.9%	6	14,435	1.2%	 
Missouri	2,808,605	1,071,068	38.1%	 	1,594,511	56.8%	10	97,359	3.5%	 
Montana	494,526	177,709	35.9%	 	279,240	56.5%	3	28,037	5.7%	 
Nebraska	844,227	284,494	33.7%	 	495,961	58.7%	2	38,946	4.6%	 
Nevada	1,125,385	539,260	47.9%	6	512,058	45.5%	 	37,384	3.3%	 
New Hampshire	744,296	348,526	46.8%	4	345,790	46.5%	 	30,777	4.1%	 
New Jersey	3,874,046	2,148,278	55.5%	14	1,601,933	41.4%	 	72,477	1.9%	 
New Mexico	798,318	385,234	48.3%	5	319,666	40.0%	 	74,541	9.3%	 
New York	7,721,453	4,556,124	59.0%	29	2,819,534	36.5%	 	176,598	2.3%	 
North Carolina	4,741,564	2,189,316	46.2%	 	2,362,631	49.8%	15	130,126	2.7%	 
North Dakota	344,360	93,758	27.2%	 	216,794	63.0%	3	21,434	6.2%	 
Ohio	5,496,487	2,394,164	43.6%	 	2,841,005	51.7%	18	174,498	3.2%	 
Oklahoma	1,452,992	420,375	28.9%	 	949,136	65.3%	7	83,481	5.7%	 
Oregon	2,001,336	1,002,106	50.1%	7	782,403	39.1%	 	94,231	4.7%	 
Pennsylvania	6,115,402	2,926,441	47.9%	 	2,970,733	48.6%	20	146,715	2.4%	 
Rhode Island	464,144	252,525	54.4%	4	180,543	38.9%	 	14,746	3.2%	 
South Carolina	2,103,027	855,373	40.7%	 	1,155,389	54.9%	9	49,204	2.3%	 
South Dakota	370,093	117,458	31.7%	 	227,721	61.5%	3	20,850	5.6%	 
Tennessee	2,508,027	870,695	34.7%	 	1,522,925	60.7%	11	70,397	2.8%	 
Texas	8,969,226	3,877,868	43.2%	 	4,685,047	52.2%	36*	283,492	3.2%	 
Utah	1,131,430	310,676	27.5%	 	515,231	45.5%	6	39,608	3.5%	 
Vermont	315,067	178,573	56.7%	3	95,369	30.3%	 	10,078	3.2%	 
Virginia	3,982,752	1,981,473	49.8%	13	1,769,443	44.4%	 	118,274	3.0%	 
Washington	3,209,214	1,742,718	54.3%	8*	1,221,747	38.1%	 	160,879	5.0%	 
West Virginia	713,051	188,794	26.5%	 	489,371	68.6%	5	23,004	3.2%	 
Wisconsin	2,976,150	1,382,536	46.5%	 	1,405,284	47.2%	10	106,674	3.6%	 
Wyoming	255,849	55,973	21.9%	 	174,419	68.2%	3	13,287	5.2%""".splitlines()
# Normalise the raw vote table: strip thousands separators and percent signs
# from each line, then split it into tab-separated cells.
STATE_VOTES = [
    line.replace(",", "").replace("%", "").split("\t")
    for line in STATE_VOTES
]

# Colour each state by raw vote counts: "b" when the Hillary count (cell 2)
# exceeds the Trump count (cell 5), otherwise "r". Keys are state abbreviations.
STATE_COLORS = {}
for row in STATE_VOTES[1:]:  # row 0 is the header line
    state_abbr = STATE_NAMES_ABBR[row[0]]
    STATE_COLORS[state_abbr] = "b" if int(row[2]) > int(row[5]) else "r"

print(STATE_COLORS)


# NOTE(review): absolute, machine-specific path; nothing in the visible cells
# reads it.
CHOROGRID_STATES_FILE='/content/Code/smishra8/chorogrid/chorogrid/databases/usa_states.csv'


def logit_transform(p):
    """Return the smoothed log-odds of ``p``.

    Computes ``log((p + 1e-8) / (1 - p + 1e-8))``. The small constant is added
    to both numerator and denominator so that ``p == 0`` and ``p == 1`` yield
    finite values instead of ``-inf`` / ``inf``.

    ``p`` may be a scalar or anything ``np.log`` operates on element-wise
    (e.g. a NumPy array or pandas Series).
    """
    smoothing = 1e-8
    odds = (p + smoothing) / (1 - p + smoothing)
    return np.log(odds)


{'WA': 'b', 'DE': 'b', 'DC': 'b', 'WI': 'r', 'WV': 'r', 'HI': 'b', 'FL': 'r', 'WY': 'r', 'NH': 'b', 'NJ': 'b', 'NM': 'b', 'TX': 'r', 'LA': 'r', 'NC': 'r', 'ND': 'r', 'NE': 'r', 'TN': 'r', 'NY': 'b', 'PA': 'r', 'CA': 'b', 'NV': 'b', 'VA': 'b', 'CO': 'b', 'AK': 'r', 'AL': 'r', 'AR': 'r', 'VT': 'b', 'IL': 'b', 'GA': 'r', 'IN': 'r', 'IA': 'r', 'OK': 'r', 'AZ': 'r', 'ID': 'r', 'CT': 'b', 'ME': 'b', 'MD': 'b', 'MA': 'b', 'OH': 'r', 'UT': 'r', 'MO': 'r', 'MN': 'b', 'MI': 'r', 'RI': 'b', 'KS': 'r', 'MT': 'r', 'MS': 'r', 'SC': 'r', 'KY': 'r', 'OR': 'b', 'SD': 'r'}

In [4]:
# Spot check: look up one state to confirm the population table was built.
STATE_POPULATIONS["AZ"]


Out[4]:
6931071.0

In [5]:
# List the columns available in the loaded frame.
df.columns


Out[5]:
Index([          u'Author',       u'City/Urban',  u'City/Urban Area',
               u'Contents',             u'Date',        u'Followers',
              u'Following',             u'GUID',           u'Gender',
                   u'Name',            u'Posts',              u'RT?',
           u'State/Region',              u'URL',             u'URL?',
              u'adjective',           u'adverb',      u'count_tweet',
               u'hashtag?',         u'mention?',         u'negation',
                   u'noun',      u'preposition',    u'processedPost',
              u'sentiment',   u'sentiment_subj', u'subjectvity_type',
                   u't_id',       u'topic_name',             u'verb',
              u't_created',       u't_retweets',      u't_favorites',
             u't_is_reply',       u't_is_quote',     u't_n_hashtags',
               u't_n_urls',     u't_n_mentions',        u't_n_media',
                   u'u_id',        u'u_created',       u'u_n_listed',
          u'u_n_favorites',    u'u_n_followers',      u'u_n_friends',
           u'u_n_statuses',    u'u_is_verified',       u'u_location',
                 u'u_name',            u'u_url', u'is_controversial',
                    u'TID',             u'CATS',          u'u_state'],
      dtype='object')

In [6]:
# Summary statistics of how many distinct URL categories each tweet carries.
# Tweets with a missing CATS value are represented by a single placeholder
# category ('UNK'), so they count as 1.
cats_or_zero = df.CATS.fillna(0)
category_counters = cats_or_zero.apply(
    lambda cats: Counter(cats) if cats != 0 else Counter(['UNK'])
)
category_counters.apply(len).describe()


Out[6]:
count    246869.000000
mean          1.139163
std           0.356983
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           5.000000
Name: CATS, dtype: float64

In [7]:
def _cats_to_counter(cats):
    """Count the URL categories on one tweet.

    ``cats`` is the tweet's CATS value after missing entries were replaced by
    0; a missing entry becomes ``Counter(['NONE'])``.
    """
    if cats == 0:
        return Counter(['NONE'])
    return Counter(cats)


# Adds the per-tweet category counter as a new column on `df` (in place);
# later cells read `df.CATS_Counter`.
df["CATS_Counter"] = df.CATS.fillna(0).apply(_cats_to_counter)

# Preview tweets that carry exactly two distinct categories.
has_two_categories = df.CATS_Counter.apply(len) == 2
df[has_two_categories]["CATS_Counter"].head()


Out[7]:
23     {u'socialmedia': 1, u'videos': 1}
29    {u'twitter': 1, u'socialmedia': 1}
38     {u'socialmedia': 1, u'videos': 1}
53     {u'socialmedia': 1, u'videos': 1}
54    {u'twitter': 1, u'socialmedia': 1}
Name: CATS_Counter, dtype: object

In [8]:
# Count and share of tweets whose `u_state` is missing (True) vs present (False).
# NOTE(review): relies on value_counts().to_frame() naming the count column
# "u_state"; newer pandas releases name it "count" -- confirm before upgrading.
df.u_state.isnull().value_counts().to_frame().assign(props=lambda x: x["u_state"]/x["u_state"].sum())


Out[8]:
u_state props
False 169038 0.684728
True 77831 0.315272

In [9]:
# Count and share of tweets with a usable state: `u_state` present and not the
# country-level "USA" label.
# NOTE(review): relies on value_counts().to_frame() naming the count column
# "u_state"; newer pandas releases name it "count" -- confirm before upgrading.
(~df.u_state.isnull() & (df.u_state !="USA")).value_counts().to_frame().assign(props=lambda x: x["u_state"]/x["u_state"].sum())


Out[9]:
u_state props
True 147924 0.5992
False 98945 0.4008

Proportion of URLs, controlling for party and population


In [10]:
# For every URL type and each of the first three topics, fit a tweet-level
# logistic regression of "tweet links to this URL type" on log10(state
# population) and the state's 2016 colour, and display the summary.
#
# The loop variables `url_type`, `topic`, `df_t` and `model` intentionally stay
# module-level: later cells reuse the values left by the last iteration.
url_types = ["fakenews", "news",]
for ui, url_type in enumerate(url_types):
    # Single-argument, parenthesised print gives the same text on Python 2 and 3.
    print("{} {}".format(ui, url_type))
    for i, topic in enumerate(topic_order[:3]):
        print("{} {}".format(i, topic))
        df_t = (
            df[
                (df.u_state != "USA")
                & (~df.u_state.isnull())
                & (df.t_n_urls > 0)
                & (df.topic_name == topic)
            ]
            .assign(
                # 1 when the tweet has at least one URL of the current type.
                # The lambda reads the loop variable `url_type`; the keyword is
                # just the name of the new column.
                url_type=lambda x: x.CATS_Counter.apply(
                    lambda k: k.get(url_type, 0) > 0
                ).astype("int"),
                # Series.map has no "default" argument (its second parameter is
                # `na_action`), so states missing from the lookup become NaN.
                population=lambda x: x.u_state.map(STATE_POPULATIONS),
                state_color=lambda x: x.u_state.map(STATE_COLORS),
            )
            # Drop rows from states without population/colour explicitly.
            # Previously such rows survived a no-op `state_color != 'k'` filter
            # and were only removed implicitly by the formula's missing-data
            # handling, so the fitted sample is unchanged.
            .dropna(subset=["population", "state_color"])
        )
        model = sm.Logit.from_formula(
            "url_type ~ np.log10(population) + state_color", data=df_t
        ).fit()
        display(model.summary2())


0 fakenews
0 Gun Control
Optimization terminated successfully.
         Current function value: 0.386068
         Iterations 6
Model: Logit Pseudo R-squared: 0.006
Dependent Variable: url_type AIC: 8527.2848
Date: 2018-01-25 21:09 BIC: 8549.2115
No. Observations: 11036 Log-Likelihood: -4260.6
Df Model: 2 LL-Null: -4287.5
Df Residuals: 11033 LLR p-value: 2.1182e-12
Converged: 1.0000 Scale: 1.0000
No. Iterations: 6.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
Intercept -2.2820 0.4656 -4.9009 0.0000 -3.1946 -1.3694
state_color[T.r] 0.4146 0.0567 7.3075 0.0000 0.3034 0.5258
np.log10(population) 0.0262 0.0660 0.3974 0.6911 -0.1031 0.1556
1 Privacy
Optimization terminated successfully.
         Current function value: 0.214723
         Iterations 7
Model: Logit Pseudo R-squared: 0.001
Dependent Variable: url_type AIC: 9272.1480
Date: 2018-01-25 21:09 BIC: 9296.0862
No. Observations: 21577 Log-Likelihood: -4633.1
Df Model: 2 LL-Null: -4636.0
Df Residuals: 21574 LLR p-value: 0.053933
Converged: 1.0000 Scale: 1.0000
No. Iterations: 7.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
Intercept -2.2916 0.4109 -5.5778 0.0000 -3.0969 -1.4864
state_color[T.r] 0.1250 0.0607 2.0604 0.0394 0.0061 0.2439
np.log10(population) -0.0846 0.0591 -1.4307 0.1525 -0.2005 0.0313
2 Vaccine
Optimization terminated successfully.
         Current function value: 0.312787
         Iterations 6
Model: Logit Pseudo R-squared: 0.002
Dependent Variable: url_type AIC: 9300.7812
Date: 2018-01-25 21:09 BIC: 9323.6000
No. Observations: 14858 Log-Likelihood: -4647.4
Df Model: 2 LL-Null: -4656.8
Df Residuals: 14855 LLR p-value: 7.8778e-05
Converged: 1.0000 Scale: 1.0000
No. Iterations: 6.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
Intercept -2.5120 0.4729 -5.3114 0.0000 -3.4390 -1.5851
state_color[T.r] 0.2491 0.0573 4.3469 0.0000 0.1368 0.3613
np.log10(population) 0.0214 0.0660 0.3251 0.7451 -0.1078 0.1507
1 news
0 Gun Control
Optimization terminated successfully.
         Current function value: 0.559483
         Iterations 5
Model: Logit Pseudo R-squared: 0.009
Dependent Variable: url_type AIC: 12354.9002
Date: 2018-01-25 21:09 BIC: 12376.8269
No. Observations: 11036 Log-Likelihood: -6174.5
Df Model: 2 LL-Null: -6230.0
Df Residuals: 11033 LLR p-value: 7.6243e-25
Converged: 1.0000 Scale: 1.0000
No. Iterations: 5.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
Intercept -0.2492 0.3413 -0.7301 0.4654 -0.9181 0.4198
state_color[T.r] -0.4638 0.0449 -10.3253 0.0000 -0.5519 -0.3758
np.log10(population) -0.0911 0.0485 -1.8780 0.0604 -0.1861 0.0040
1 Privacy
Optimization terminated successfully.
         Current function value: 0.571952
         Iterations 5
Model: Logit Pseudo R-squared: 0.002
Dependent Variable: url_type AIC: 24687.9996
Date: 2018-01-25 21:09 BIC: 24711.9377
No. Observations: 21577 Log-Likelihood: -12341.
Df Model: 2 LL-Null: -12361.
Df Residuals: 21574 LLR p-value: 2.9467e-09
Converged: 1.0000 Scale: 1.0000
No. Iterations: 5.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
Intercept -1.0333 0.2144 -4.8190 0.0000 -1.4536 -0.6131
state_color[T.r] -0.2024 0.0325 -6.2358 0.0000 -0.2660 -0.1388
np.log10(population) 0.0088 0.0307 0.2865 0.7745 -0.0515 0.0691
2 Vaccine
Optimization terminated successfully.
         Current function value: 0.490474
         Iterations 5
Model: Logit Pseudo R-squared: 0.004
Dependent Variable: url_type AIC: 14580.9189
Date: 2018-01-25 21:09 BIC: 14603.7378
No. Observations: 14858 Log-Likelihood: -7287.5
Df Model: 2 LL-Null: -7319.0
Df Residuals: 14855 LLR p-value: 1.9532e-14
Converged: 1.0000 Scale: 1.0000
No. Iterations: 5.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
Intercept -0.1177 0.3301 -0.3565 0.7215 -0.7646 0.5292
state_color[T.r] -0.3293 0.0441 -7.4693 0.0000 -0.4157 -0.2429
np.log10(population) -0.1670 0.0462 -3.6135 0.0003 -0.2575 -0.0764

In [11]:
# Refit the same logit on `df_t` as left behind by the last iteration of the
# loop in the previous cell (the rendered output matches the "news" / "Vaccine"
# fit). This cell therefore only works after that loop has run.
model = sm.Logit.from_formula("url_type ~ np.log10(population) + state_color", data=df_t).fit()
model.summary2()


Optimization terminated successfully.
         Current function value: 0.490474
         Iterations 5
Out[11]:
Model: Logit Pseudo R-squared: 0.004
Dependent Variable: url_type AIC: 14580.9189
Date: 2018-01-25 21:09 BIC: 14603.7378
No. Observations: 14858 Log-Likelihood: -7287.5
Df Model: 2 LL-Null: -7319.0
Df Residuals: 14855 LLR p-value: 1.9532e-14
Converged: 1.0000 Scale: 1.0000
No. Iterations: 5.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
Intercept -0.1177 0.3301 -0.3565 0.7215 -0.7646 0.5292
state_color[T.r] -0.3293 0.0441 -7.4693 0.0000 -0.4157 -0.2429
np.log10(population) -0.1670 0.0462 -3.6135 0.0003 -0.2575 -0.0764

In [12]:
def significance_stars(p):
    """Return the conventional star annotation for a p-value.

    ======  ==============
    stars   condition
    ======  ==============
    ""      p > 0.05 (not significant) or p is NaN
    *       p <= 0.05
    **      p <= 0.01
    ***     p <= 0.001
    ****    p <= 0.0001
    ======  ==============

    Each threshold is inclusive, so ``p == 0.01`` yields ``"**"``.
    """
    # NaN compares False against every threshold and would otherwise fall
    # through to "****"; an undefined p-value must not look highly significant.
    if p != p:
        return ""
    if p > 0.05:
        return ""
    if p > 0.01:
        return "*"
    if p > 0.001:
        return "**"
    if p > 0.0001:
        return "***"
    return "****"

In [13]:
# Preview the panel-title text for the most recently fitted model: topic name,
# coefficient of the "r" state colour, and its significance stars.
coef_table = model.summary2().tables[1]
rep_coef = coef_table.loc["state_color[T.r]", "Coef."]
rep_pvalue = coef_table.loc["state_color[T.r]", "P>|z|"]
print("{} $(\\beta_{{Rep}}={:.3f}^{{{}}})$".format(
    topic, rep_coef, significance_stars(rep_pvalue)
))


Vaccine $(\beta_{Rep}=-0.329^{****})$

In [14]:
def _select_topic_url_tweets(df, topic):
    """Return the rows of ``df`` for ``topic`` that carry a URL and a usable state.

    Rows whose ``u_state`` is missing or equal to the country-level ``"USA"``
    label are excluded, as are tweets with ``t_n_urls == 0``.
    """
    keep = (
        (df.u_state != "USA")
        & df.u_state.notnull()
        & (df.t_n_urls > 0)
        & (df.topic_name == topic)
    )
    return df[keep]


def _state_url_summary(df_topic, url_type):
    """Aggregate ``url_type`` URL counts per state.

    Returns one row per state that has an entry in ``STATE_POPULATIONS``, with
    columns ``u_state, success, total, proportion, failure, population,
    std_err, log_odds``.

    NOTE(review): ``success``/``proportion`` are built from the per-tweet
    category *count*, whereas the regression uses a 0/1 indicator; confirm
    whether a tweet can carry the same category more than once.
    """
    return (
        df_topic
        .assign(**{
            # Per-tweet count of URLs in this category (0 when absent).
            url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))
        })
        [["u_state", url_type]].groupby("u_state")[url_type]
        .agg([np.sum, len, np.mean])
        .reset_index()
        .rename(columns={
            "sum": "success",
            "len": "total",
            "mean": "proportion"
        })
        .assign(
            failure=lambda x: x["total"] - x["success"],
            # Series.map has no "default" argument (its second parameter is
            # `na_action`); states missing from the lookup become NaN ...
            population=lambda x: x.u_state.map(STATE_POPULATIONS),
            std_err=lambda x: np.sqrt(x["proportion"]*(1-x["proportion"])/x["total"])
        )
        # ... and are removed here, along with rows whose std_err is undefined.
        .dropna()
        .assign(log_odds=lambda x: logit_transform(x["proportion"]))
    )


def _fit_party_population_logit(df_topic, url_type):
    """Fit ``url_type ~ log10(population) + state_color`` on tweet-level rows.

    The response is 1 when the tweet has at least one URL of ``url_type``.
    Tweets from states without a population or colour entry are dropped
    explicitly; previously they were only removed implicitly by the formula's
    missing-data handling, so the fitted sample is unchanged.
    """
    model_data = (
        df_topic
        .assign(
            url_type=lambda x: x.CATS_Counter.apply(
                lambda k: k.get(url_type, 0) > 0
            ).astype("int"),
            population=lambda x: x.u_state.map(STATE_POPULATIONS),
            state_color=lambda x: x.u_state.map(STATE_COLORS),
        )
        .dropna(subset=["population", "state_color"])
    )
    return sm.Logit.from_formula(
        "url_type ~ np.log10(population) + state_color", data=model_data
    ).fit()


def plot_by_topic(df, nstates=10):
    """Plot per-state URL proportions against population, by URL type and topic.

    Draws a grid with one row per URL type ("fakenews", "news") and one column
    per topic (the first three entries of ``topic_order``). Each panel shows:

    * one labelled point per state (proportion vs. population, log x-axis),
      with the label coloured by the state's entry in ``STATE_COLORS``;
    * the pooled proportion as a dashed line with a +/- one standard error band;
    * the fitted logit curves for "r" and "b" states;
    * a title with the coefficient of the "r" colour and its significance stars.

    The ten highest-proportion states and the model summary are displayed for
    every panel, and the figure is saved to "Population Party Proportion.pdf".

    Parameters
    ----------
    df : DataFrame
        Tweet-level frame; the columns read are ``u_state``, ``t_n_urls``,
        ``topic_name`` and ``CATS_Counter``.
    nstates : int or None
        Currently unused; kept so existing calls keep working.
    """
    url_types = ["fakenews", "news",]
    topics = topic_order[:3]
    ncols = len(topics)
    nrows = len(url_types)
    w, h = 6, 6
    fig, ax = plt.subplots(
        nrows, ncols,
        sharex=True,
        figsize=(ncols*w, nrows*h)
    )
    for ui, url_type in enumerate(url_types):
        # Single-argument, parenthesised print gives the same text on Python 2 and 3.
        print("{} {}".format(ui, url_type))
        for i, topic in enumerate(topics):
            print("{} {}".format(i, topic))
            panel = ax[ui, i]
            df_topic = _select_topic_url_tweets(df, topic)

            # --- state-level scatter ---
            df_t = _state_url_summary(df_topic, url_type)
            display(df_t.sort_values("proportion", ascending=False).head(10))
            X, Y = df_t.population.values, df_t.proportion.values
            panel.plot(
                X, Y,
                linestyle="none",
                color="k", marker="o", ms=5,
                alpha=0.5
            )

            # --- pooled proportion with a one-standard-error band ---
            overall_p = df_t.success.sum()*1./df_t.total.sum()
            overall_err = np.sqrt(overall_p*(1-overall_p)/df_t.total.sum())
            panel.axhline(y=overall_p, ls="--", lw=1, color="0.5")
            panel.axhspan(
                overall_p-overall_err, overall_p+overall_err,
                color="0.9", alpha=0.3
            )
            panel.set_xscale("log")
            panel.set_ylabel("$p(url={})$".format(url_type))
            for j, state in enumerate(df_t.u_state.values):
                panel.text(
                    X[j], Y[j], state,
                    # Falls back to black rather than raising if a state has a
                    # population entry but no colour entry.
                    fontsize=14, color=STATE_COLORS.get(state, "k"), alpha=0.7
                )

            # --- tweet-level model and its fitted curves ---
            model = _fit_party_population_logit(df_topic, url_type)
            summary = model.summary2()
            display(summary)
            for sc in ["r", "b"]:
                df_tt = pd.DataFrame({
                    "population": 10**np.arange(5.5, 8.1, 0.1)
                }).assign(state_color=sc)
                x = df_tt.population
                y = model.predict(df_tt)
                panel.plot(x, y, color=sc, linestyle="--", lw=2, alpha=0.7)
            coef_table = summary.tables[1]
            title = "{} $(\\beta_{{Rep}}={:.3f}^{{{}}})$".format(
                topic,
                coef_table.loc["state_color[T.r]", "Coef."],
                significance_stars(coef_table.loc["state_color[T.r]", "P>|z|"])
            )
            panel.set_title(title)
    sns.despine(offset=10)
    fig.tight_layout()
    plt.savefig("Population Party Proportion.pdf", bbox_inches="tight")
nstates=None

In [15]:
# Render and save the figure. `nstates` is the module-level value set to None
# in the previous cell; plot_by_topic does not read it.
plot_by_topic(df, nstates=nstates)


0 fakenews
0 Gun Control
u_state success total proportion failure population std_err log_odds
43 SD 5 10 0.500000 5 865454.0 0.158114 0.000000
0 AK 5 14 0.357143 9 741894.0 0.128060 -0.587787
1 AL 32 130 0.246154 98 4863300.0 0.037781 -1.119232
51 WV 4 20 0.200000 16 1831102.0 0.089443 -1.386294
50 WI 41 206 0.199029 165 5778708.0 0.027818 -1.392373
47 VA 61 330 0.184848 269 8411808.0 0.021368 -1.483837
13 IA 12 67 0.179104 55 3134693.0 0.046845 -1.522426
10 FL 126 707 0.178218 581 20612439.0 0.014393 -1.528469
45 TX 167 938 0.178038 771 27862596.0 0.012491 -1.529695
44 TN 29 165 0.175758 136 6651194.0 0.029631 -1.545359
Optimization terminated successfully.
         Current function value: 0.386068
         Iterations 6
Model: Logit Pseudo R-squared: 0.006
Dependent Variable: url_type AIC: 8527.2848
Date: 2018-01-25 21:09 BIC: 8549.2115
No. Observations: 11036 Log-Likelihood: -4260.6
Df Model: 2 LL-Null: -4287.5
Df Residuals: 11033 LLR p-value: 2.1182e-12
Converged: 1.0000 Scale: 1.0000
No. Iterations: 6.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
Intercept -2.2820 0.4656 -4.9009 0.0000 -3.1946 -1.3694
state_color[T.r] 0.4146 0.0567 7.3075 0.0000 0.3034 0.5258
np.log10(population) 0.0262 0.0660 0.3974 0.6911 -0.1031 0.1556
1 Privacy
u_state success total proportion failure population std_err log_odds
0 AK 8 41 0.195122 33 741894.0 0.061891 -1.417066
1 AL 33 219 0.150685 186 4863300.0 0.024174 -1.729239
21 ME 9 88 0.102273 79 1331479.0 0.032301 -2.172223
12 IA 12 139 0.086331 127 3134693.0 0.023822 -2.359280
32 NJ 43 500 0.086000 457 8944469.0 0.012538 -2.363483
27 MT 4 47 0.085106 43 1042520.0 0.040702 -2.374906
50 WI 20 257 0.077821 237 5778708.0 0.016711 -2.472328
51 WV 2 26 0.076923 24 1831102.0 0.052259 -2.484907
23 MN 15 200 0.075000 185 5519952.0 0.018625 -2.512306
9 FL 92 1252 0.073482 1160 20612439.0 0.007374 -2.534387
Optimization terminated successfully.
         Current function value: 0.214723
         Iterations 7
Model: Logit Pseudo R-squared: 0.001
Dependent Variable: url_type AIC: 9272.1480
Date: 2018-01-25 21:09 BIC: 9296.0862
No. Observations: 21577 Log-Likelihood: -4633.1
Df Model: 2 LL-Null: -4636.0
Df Residuals: 21574 LLR p-value: 0.053933
Converged: 1.0000 Scale: 1.0000
No. Iterations: 7.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
Intercept -2.2916 0.4109 -5.5778 0.0000 -3.0969 -1.4864
state_color[T.r] 0.1250 0.0607 2.0604 0.0394 0.0061 0.2439
np.log10(population) -0.0846 0.0591 -1.4307 0.1525 -0.2005 0.0313
2 Vaccine
u_state success total proportion failure population std_err log_odds
32 NM 14 57 0.245614 43 2081015.0 0.057015 -1.122143
5 CO 51 232 0.219828 181 5540545.0 0.027189 -1.266671
33 NV 38 176 0.215909 138 2940058.0 0.031014 -1.289667
21 ME 14 74 0.189189 60 1331479.0 0.045529 -1.455287
49 WI 37 197 0.187817 160 5778708.0 0.027827 -1.464256
3 AZ 47 265 0.177358 218 6931071.0 0.023464 -1.534347
0 AK 3 17 0.176471 14 741894.0 0.092459 -1.540445
13 ID 7 40 0.175000 33 1683140.0 0.060078 -1.550597
6 CT 15 90 0.166667 75 3576452.0 0.039284 -1.609438
31 NJ 53 336 0.157738 283 8944469.0 0.019885 -1.675155
Optimization terminated successfully.
         Current function value: 0.312787
         Iterations 6
Model: Logit Pseudo R-squared: 0.002
Dependent Variable: url_type AIC: 9300.7812
Date: 2018-01-25 21:09 BIC: 9323.6000
No. Observations: 14858 Log-Likelihood: -4647.4
Df Model: 2 LL-Null: -4656.8
Df Residuals: 14855 LLR p-value: 7.8778e-05
Converged: 1.0000 Scale: 1.0000
No. Iterations: 6.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
Intercept -2.5120 0.4729 -5.3114 0.0000 -3.4390 -1.5851
state_color[T.r] 0.2491 0.0573 4.3469 0.0000 0.1368 0.3613
np.log10(population) 0.0214 0.0660 0.3251 0.7451 -0.1078 0.1507
1 news
0 Gun Control
u_state success total proportion failure population std_err log_odds
8 DC 194 429 0.452214 235 681170.0 0.024030 -0.191727
2 AR 24 68 0.352941 44 2988248.0 0.057952 -0.606136
35 NY 394 1154 0.341421 760 19745289.0 0.013959 -0.656968
29 ND 2 6 0.333333 4 757952.0 0.192450 -0.693147
48 VT 7 21 0.333333 14 624594.0 0.102869 -0.693147
15 IL 132 421 0.313539 289 12801539.0 0.022611 -0.783625
12 HI 9 29 0.310345 20 1428557.0 0.085909 -0.798508
49 WA 122 394 0.309645 272 7288000.0 0.023293 -0.801781
6 CO 73 239 0.305439 166 5540545.0 0.029793 -0.821528
0 AK 4 14 0.285714 10 741894.0 0.120736 -0.916291
Optimization terminated successfully.
         Current function value: 0.559483
         Iterations 5
Model: Logit Pseudo R-squared: 0.009
Dependent Variable: url_type AIC: 12354.9002
Date: 2018-01-25 21:09 BIC: 12376.8269
No. Observations: 11036 Log-Likelihood: -6174.5
Df Model: 2 LL-Null: -6230.0
Df Residuals: 11033 LLR p-value: 7.6243e-25
Converged: 1.0000 Scale: 1.0000
No. Iterations: 5.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
Intercept -0.2492 0.3413 -0.7301 0.4654 -0.9181 0.4198
state_color[T.r] -0.4638 0.0449 -10.3253 0.0000 -0.5519 -0.3758
np.log10(population) -0.0911 0.0485 -1.8780 0.0604 -0.1861 0.0040
1 Privacy
u_state success total proportion failure population std_err log_odds
8 DE 10 22 0.454545 12 952065.0 0.106159 -0.182322
48 VT 16 48 0.333333 32 624594.0 0.068041 -0.693147
28 NC 168 508 0.330709 340 10146788.0 0.020874 -0.704982
24 MO 62 189 0.328042 127 6093000.0 0.034151 -0.717053
11 HI 27 84 0.321429 57 1428557.0 0.050957 -0.747214
7 DC 574 1846 0.310943 1272 681170.0 0.010773 -0.795716
19 MA 184 607 0.303130 423 6811779.0 0.018655 -0.832436
31 NH 31 103 0.300971 72 1334795.0 0.045195 -0.842679
29 ND 3 10 0.300000 7 757952.0 0.144914 -0.847298
33 NM 18 61 0.295082 43 2081015.0 0.058395 -0.870828
Optimization terminated successfully.
         Current function value: 0.571952
         Iterations 5
Model: Logit Pseudo R-squared: 0.002
Dependent Variable: url_type AIC: 24687.9996
Date: 2018-01-25 21:09 BIC: 24711.9377
No. Observations: 21577 Log-Likelihood: -12341.
Df Model: 2 LL-Null: -12361.
Df Residuals: 21574 LLR p-value: 2.9467e-09
Converged: 1.0000 Scale: 1.0000
No. Iterations: 5.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
Intercept -1.0333 0.2144 -4.8190 0.0000 -1.4536 -0.6131
state_color[T.r] -0.2024 0.0325 -6.2358 0.0000 -0.2660 -0.1388
np.log10(population) 0.0088 0.0307 0.2865 0.7745 -0.0515 0.0691
2 Vaccine
u_state success total proportion failure population std_err log_odds
42 SD 4 10 0.400000 6 865454.0 0.154919 -0.405465
19 MA 154 447 0.344519 293 6811779.0 0.022477 -0.643220
32 NM 17 57 0.298246 40 2081015.0 0.060596 -0.855666
0 AK 5 17 0.294118 12 741894.0 0.110510 -0.875469
20 MD 64 230 0.278261 166 6016447.0 0.029550 -0.953105
12 IA 20 72 0.277778 52 3134693.0 0.052786 -0.955511
7 DC 106 385 0.275325 279 681170.0 0.022765 -0.967773
40 RI 11 40 0.275000 29 1056426.0 0.070600 -0.969401
34 NY 456 1660 0.274699 1204 19745289.0 0.010956 -0.970912
29 NE 10 37 0.270270 27 1907116.0 0.073009 -0.993252
Optimization terminated successfully.
         Current function value: 0.490474
         Iterations 5
Model: Logit Pseudo R-squared: 0.004
Dependent Variable: url_type AIC: 14580.9189
Date: 2018-01-25 21:09 BIC: 14603.7378
No. Observations: 14858 Log-Likelihood: -7287.5
Df Model: 2 LL-Null: -7319.0
Df Residuals: 14855 LLR p-value: 1.9532e-14
Converged: 1.0000 Scale: 1.0000
No. Iterations: 5.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
Intercept -0.1177 0.3301 -0.3565 0.7215 -0.7646 0.5292
state_color[T.r] -0.3293 0.0441 -7.4693 0.0000 -0.4157 -0.2429
np.log10(population) -0.1670 0.0462 -3.6135 0.0003 -0.2575 -0.0764
/homed/content/anaconda3/envs/python2/lib/python2.7/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family [u'sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))

In [16]:
# Number of states assigned each colour ("r" / "b") in STATE_COLORS.
pd.Series(STATE_COLORS.values()).value_counts()


Out[16]:
r    30
b    21
dtype: int64

In [17]:
# Number of URL-bearing tweets from usable states (not missing, not "USA"),
# split by the state's colour. Tweets whose state has no entry in
# STATE_COLORS get a missing colour and fall out of the groupby.
has_usable_state = (df.u_state != "USA") & df.u_state.notnull()
url_tweets = df[has_usable_state & (df.t_n_urls > 0)]
url_tweets_colored = url_tweets.assign(
    state_color=url_tweets.u_state.map(STATE_COLORS)
)
url_tweets_colored.groupby("state_color").t_n_urls.count()


Out[17]:
state_color
b    38521
r    29255
Name: t_n_urls, dtype: int64

In [18]:
# Preview the per-tweet category counters added to `df` earlier.
df.CATS_Counter.head()


Out[18]:
0    {u'NONE': 1}
1    {u'NONE': 1}
2    {u'NONE': 1}
3    {u'NONE': 1}
4    {u'NONE': 1}
Name: CATS_Counter, dtype: object

In [19]:
# Total and mean number of "UNK"-category URLs per tweet, split by
# `is_controversial`.
unk_per_tweet = df.CATS_Counter.apply(lambda counter: counter.get("UNK", 0))
(
    df.assign(UNK_count=unk_per_tweet)
    .groupby("is_controversial")
    .UNK_count
    .agg([np.sum, np.mean])
)


Out[19]:
sum mean
is_controversial
False 9799 0.099780
True 21192 0.142551

In [25]:
# Typed per-state vote table built from the parsed STATE_VOTES rows, plus the
# Hillary-minus-Trump percentage-point margin.
vote_columns = [
    "State", "Total",
    "Hillary Votes", "Hillary Percent", "Hillary EV",
    "Trump Votes", "Trump Percent", "Trump EV",
    "Other Votes", "Other Percent", "Other EV",
]

# A row with fewer cells than the header (the final row of the pasted table has
# no trailing "Other EV" cell) would otherwise get a missing value that
# astype("str") turns into the literal text "None". Pad such rows with empty
# strings instead.
vote_rows = [
    row + [""] * (len(vote_columns) - len(row))
    for row in STATE_VOTES[1:]  # row 0 is the header line
]

df_state_votes = (
    pd.DataFrame(vote_rows, columns=vote_columns)
    .astype({
        "State": "str", "Total": "int",
        "Hillary Votes": "int", "Hillary Percent": "float", "Hillary EV": "str",
        "Trump Votes": "int", "Trump Percent": "float", "Trump EV": "str",
        "Other Votes": "int", "Other Percent": "float", "Other EV": "str",
    })
    .assign(hill_trump_diff=lambda x: x["Hillary Percent"] - x["Trump Percent"])
)
df_state_votes.head()


Out[25]:
State Total Hillary Votes Hillary Percent Hillary EV Trump Votes Trump Percent Trump EV Other Votes Other Percent Other EV hill_trump_diff
0 Alabama 2123372 729547 34.4 1318255 62.1 9 44467 2.1 -27.7
1 Alaska 318608 116454 36.6 163387 51.3 3 18725 5.9 -14.7
2 Arizona 2573165 1161167 45.1 1252401 48.7 11 106327 4.1 -3.6
3 Arkansas 1130635 380494 33.7 684872 60.6 6 29829 2.6 -26.9
4 California 14181595 8753788 61.7 55 4483810 31.6 478500 3.4 30.1

In [26]:
# Ten states with the most negative Hillary-minus-Trump margin.
df_state_votes.sort_values("hill_trump_diff").head(10)


Out[26]:
State Total Hillary Votes Hillary Percent Hillary EV Trump Votes Trump Percent Trump EV Other Votes Other Percent Other EV hill_trump_diff
50 Wyoming 255849 55973 21.9 174419 68.2 3 13287 5.2 None -46.3
48 West Virginia 713051 188794 26.5 489371 68.6 5 23004 3.2 -42.1
36 Oklahoma 1452992 420375 28.9 949136 65.3 7 83481 5.7 -36.4
34 North Dakota 344360 93758 27.2 216794 63.0 3 21434 6.2 -35.8
12 Idaho 690255 189765 27.5 409055 59.3 4 28331 4.1 -31.8
41 South Dakota 370093 117458 31.7 227721 61.5 3 20850 5.6 -29.8
17 Kentucky 1924149 628854 32.7 1202971 62.5 8 53752 2.8 -29.8
0 Alabama 2123372 729547 34.4 1318255 62.1 9 44467 2.1 -27.7
3 Arkansas 1130635 380494 33.7 684872 60.6 6 29829 2.6 -26.9
42 Tennessee 2508027 870695 34.7 1522925 60.7 11 70397 2.8 -26.0

In [27]:
# Ten states with the most positive Hillary-minus-Trump margin.
df_state_votes.sort_values("hill_trump_diff", ascending=False).head(10)


Out[27]:
State Total Hillary Votes Hillary Percent Hillary EV Trump Votes Trump Percent Trump EV Other Votes Other Percent Other EV hill_trump_diff
8 Dist. of Col. 311268 282830 90.9 3 12723 4.1 4906 1.6 86.8
11 Hawaii 428937 266891 62.2 3* 128847 30.0 15954 3.7 32.2
4 California 14181595 8753788 61.7 55 4483810 31.6 478500 3.4 30.1
21 Massachusetts 3325046 1995196 60.0 11 1090893 32.8 138018 4.2 27.2
45 Vermont 315067 178573 56.7 3 95369 30.3 10078 3.2 26.4
20 Maryland 2781446 1677928 60.3 10 943169 33.9 79605 2.9 26.4
32 New York 7721453 4556124 59.0 29 2819534 36.5 176598 2.3 22.5
13 Illinois 5536424 3090729 55.8 20 2146015 38.8 209596 3.8 17.0
47 Washington 3209214 1742718 54.3 8* 1221747 38.1 160879 5.0 16.2
39 Rhode Island 464144 252525 54.4 4 180543 38.9 14746 3.2 15.5

In [30]:
# Histogram of the per-state margin in 5-point bins spanning -100 to 100.
df_state_votes.hill_trump_diff.hist(bins=np.arange(-100, 105, 5))


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3299e94150>

In [31]:
# Number of states with a positive (True) vs non-positive (False) margin.
(df_state_votes.hill_trump_diff > 0).value_counts()


Out[31]:
False    30
True     21
Name: hill_trump_diff, dtype: int64

In [33]:
# Vote rows for a hand-picked set of six states.
df_state_votes[df_state_votes.State.isin([
    "Alaska", "Alabama", "Iowa",
    "Delaware", "Vermont", "Hawaii"
])]


Out[33]:
State Total Hillary Votes Hillary Percent Hillary EV Trump Votes Trump Percent Trump EV Other Votes Other Percent Other EV hill_trump_diff
0 Alabama 2123372 729547 34.4 1318255 62.1 9 44467 2.1 -27.7
1 Alaska 318608 116454 36.6 163387 51.3 3 18725 5.9 -14.7
7 Delaware 441590 235603 53.4 3 185127 41.9 14757 3.3 11.5
11 Hawaii 428937 266891 62.2 3* 128847 30.0 15954 3.7 32.2
15 Iowa 1566031 653669 41.7 800983 51.1 6 59186 3.8 -9.4
45 Vermont 315067 178573 56.7 3 95369 30.3 10078 3.2 26.4

In [ ]: